Loading functions made by Luke for later processing + making sure we are getting the conditions we want.
# The available conditions and their start and end frame indices
# to use the whole file, just set the value for the condition to c(NA, NA)
conditions <- list(
jointlead = c(NA, NA),
leadfollow = c(NA, NA)
)
# Calling functions
source("Functions.R")
And so, some more boring stuff…This is just getting the labels so we can make sure that the tracked markers match up with what we expected, really, this shouldn’t go wrong, but if you don’t check this kind of thing you’ll end up scratching your head later wondering why everything broke.
# load the labels from the XML file
# load the XML file
xmlfile <- xmlParse("PerAct23_LabelList.xml")
# get the labels, which are in the following format:
# <QTM_Label_List_Ver_1.00>
# <Trajectories>
# <Trajectory>
# <Name>A_head_top</Name>
# <Color R="0" G="147" B="0"/>
# </Trajectory>
# </Trajectories>
# </QTM_Label_List_Ver_1.00>
# get the trajectory names
traj_names <- xpathSApply(xmlfile, "//Trajectory/Name", xmlValue)
# get the trajectory colors
traj_colors <- xpathSApply(xmlfile, "//Trajectory/Color", xmlAttrs)
# convert the colors to hex
traj_colors <- rgb(
as.numeric(traj_colors[1,]),
as.numeric(traj_colors[2,]),
as.numeric(traj_colors[3,]),
alpha = 255,
maxColorValue = 255
)
# combine the names and colors into a data frame
traj_labels <- data.frame(
traj_names,
traj_colors,
stringsAsFactors = FALSE
)
rm(xmlfile, traj_names, traj_colors) #cleaning up
Finally loading in data hehehe (insert elmo meme with fire in the background)
Aditionally, this code is transforming data from a wide format, where each marker’s x, y, and z coordinates are in separate columns, to a long format where all the coordinates are in a single column. It then adds more information about each observation (subject, axis, and marker) before pivoting it back to a wide format.
## the code lists files in the specified directory that have names ending with ".tsv" and stores the list of file paths in the variable traj_files.
traj_files <- fs::dir_ls(data_dir, regexp = "\\.tsv$")
We make a list that contains all the individual dataframes. This list will be called on and iterated through throughout the next steps. (was called traj_data_list in lukes script. Here it is list_of_dataframes)
# Now we can actually load in the data. When we are loading in the data, we are also renaming our list to be the name of the individual groups and conditions
list_of_dataframes <- list()
for (file_path in traj_files) {
# Load the data from the file
traj_data <- process_qtm_tsv(file_path)
# Extract the group number and condition from the filename
group_number <- sub(".*group([0-9]+).*\\.tsv", "\\1", basename(file_path))
condition <- traj_data$metadata$condition
# Create a unique identifier for the combination of group number and condition
group_condition_identifier <- paste0("group", group_number, "_", condition)
# Check if a dataframe with this identifier already exists in the list
if (group_condition_identifier %in% names(list_of_dataframes)) {
# If it exists, append the data to the existing dataframe
list_of_dataframes[[group_condition_identifier]]$data <- rbind(list_of_dataframes[[group_condition_identifier]]$data, traj_data$data)
} else {
# If it doesn't exist, create a new dataframe and add it to the list
list_of_dataframes[[group_condition_identifier]] <- traj_data
}
}
# Now, list_of_dataframes contains individual lists for each unique combination of group number and condition
rm(group_condition_identifier,group_number)
This is so we can have nice individual dataframes later on for each group and condition
list_of_dataframes <- lapply(list_of_dataframes, function(x) {
# Add condition and group information to the data frame
x$data$condition <- x$metadata$condition
# Set x$data$group to be the list name
x$data$group <- basename(file_path)
return(x)
})
Making sure the list only contains the data from each group, and not the metadata.
# Assuming list_of_dataframes is a list of data frames with both 'data' and 'metadata'
for (i in seq_along(list_of_dataframes)) {
x <- list_of_dataframes[[i]]
# Check if 'data' and 'metadata' components exist in each list element
if (!all(c("data", "metadata") %in% names(x))) {
warning("List element does not have 'data' and/or 'metadata'. Skipping.")
next
}
# Extract condition and group information from the file path
file_path <- names(list_of_dataframes)[i]
condition <- sub(".*group[0-9]+_([^_\\.]+).*", "\\1", file_path)
group_number <- sub(".*group([0-9]+).*", "\\1", file_path)
# Add condition and group information to the data frame
x$data$condition <- condition
x$data$group <- paste0("group", group_number)
# Overwrite the original list element with the processed data
list_of_dataframes[[i]] <- x$data
}
# 'list_of_dataframes' now contains all the processed data frames
rm(x, condition, group_number, i) #Cleaning up
The columns “group” and “condition” shall be looked at as factors, and we are also performing quality control to make sure no markers have weird unique names.
library(dplyr)
library(stringi) # For stri_replace_last_regex
library(assertthat) # For assert_that
# Loop through each index of the list
for (i in seq_along(list_of_dataframes)) {
# Extract the current data frame directly from the list
df <- list_of_dataframes[[i]]
# Check if 'condition' and 'group' columns exist
if (!all(c("condition", "group") %in% names(df))) {
warning(paste("Data frame at index", i, "does not have 'condition' and/or 'group' columns. Skipping."))
next
}
# Add factors for condition and group
df$condition <- factor(df$condition)
df$group <- factor(df$group)
# Print the data frame index
cat("Data frame at index:", i, "\n")
# Print the first few rows of the data frame
print(head(df))
# Ensure all marker names are the same
marker_names <- unique(df %>% select(contains("_x")) %>% names() %>% stri_replace_last_regex("_x", ""))
assert_that(
all(marker_names == traj_labels$traj_names),
msg = paste("Not all marker names are the same in Data frame at index", i)
)
# Update the original data frame in the list
list_of_dataframes[[i]] <- df
cat("\n")
}
rm(df)
Final step before we can work on the dataframes. Currently, the
library(dplyr)
library(tidyr)
library(stringi) # For string manipulation
# Loop through each index of the list
for (i in seq_along(list_of_dataframes)) {
# Extract the current data frame directly from the list
df <- list_of_dataframes[[i]]
# Check if 'condition' and 'group' columns exist
if (!all(c("condition", "group") %in% names(df))) {
warning(paste("Data frame at index", i, "does not have 'condition' and/or 'group' columns. Skipping."))
next
}
# Pivot the data
df <- df %>%
pivot_longer(
cols = contains("_x") | contains("_y") | contains("_z"),
names_to = "marker",
values_to = "value"
) %>%
mutate(
subject = stri_replace_first_regex(marker, "^([AB])_.*", "$1"),
axis = stri_extract_last_regex(marker, "[xyz]$"),
marker = stri_replace_first_regex(marker, "^[AB]_([a-zA-Z_]+)_[xyz]$", "$1")
) %>%
# Move axes to columns
pivot_wider(
names_from = axis,
values_from = value
)
# Update the original data frame in the list
list_of_dataframes[[i]] <- df
# Print the data frame index
cat("Data frame at index:", i, "\n")
# Print the first few rows of the pivoted data frame
print(head(df))
cat("\n")
}
# Now 'list_of_dataframes' contains all the modified data frames
rm(df, i, marker_names)
# Loop through each dataframe in the list and arrange by 'elapsed_time'
list_of_dataframes <- lapply(list_of_dataframes, function(df) {
df %>% arrange(elapsed_time)
})
Now we have succesfully converted the files from Qualysis into dataframes that we can work on throughout the project, we will save each dataframe to a seperate folder. We also clean the environment of any values and dataframes that will not be useful going forward.
# Load necessary library
library(tidyverse)
# Check if the list is named; if not, you might need to assign names
if (is.null(names(list_of_dataframes))) {
warning("The list_of_dataframes is not named. Data frames will be saved with index-based names.")
names(list_of_dataframes) <- paste0("dataframe_", seq_along(list_of_dataframes))
}
# Create a sub-directory for the CSV files if it doesn't already exist
dir.create("data/raw_mocap_csv", showWarnings = FALSE, recursive = TRUE)
# Loop through each data frame in the list
for (df_name in names(list_of_dataframes)) {
# Define the file path and name for each CSV within the data/csv_files directory
file_path <- paste0("data/raw_mocap_csv/", df_name, ".csv")
# Save the data frame to a CSV file
write_csv(list_of_dataframes[[df_name]], file_path)
}
# Inform the user that the operation is complete
cat("All data frames have been saved in the 'data/csv_files' directory with their respective names.\n")
## All data frames have been saved in the 'data/csv_files' directory with their respective names.
A final clean before we work with our CSV files
#CLEANING YAY
rm(conditions, traj_data,traj_labels, i, marker_names, traj_files, data_dir, df_name, file_path, traj_files)
We are only interested in a short frame of time within all the recorded data. However, since we will run into issues if we trim before gapfilling, we will save into two seperate dataframes: one that is trimmed into 30 seconds but not gapfilled, and one that is gapfilled before trimming to 30 seconds. We can then use the non-gapfilled version to check for NAs, only within the timeframe we are actually interested in.
list_of_dataframes_unfilled <- list_of_dataframes
We now commence with gap filling one of the dataframes. Prior to doing this, we ran all the chunks in the NA-checking section to identify the most fitting markers; in order to save computational power, this section is now placed in the front to only visualise the relevant 30 seconds. For a detailed walkthrough of how we decided which markers to use, see following sections.
The reason why we gap-fill before trimming, is in order to account for all the NA’s that might exist in the intervals of trimming. This could result in sequences of NA’s towards the end and the beginning of the relevant sequence. When writing this document roughly 6000 NAs were eliminated when changing the order of gap-filling and trimming.
In order to later showcase the amount of NA’s in the other markers and to validify our choice of markers, the unfilled dataframe is also not filtered, and keeps data for every marker.
# get the markers of interest
markers_of_interest <- c(
"hand_right",
"head_left",
"hand_left",
"chest"
)
# Apply the filter to each dataframe in list_of_dataframes
list_of_dataframes <- lapply(list_of_dataframes, function(df) {
df %>% dplyr::filter(marker %in% markers_of_interest)
})
One of the lists is now used to gap-fill while the other stays empty:
# Assuming markers_of_interest is defined and has at least 4 elements
# Assuming gap_fill_linear is a function that performs linear gap filling
for (i in seq_along(list_of_dataframes)) {
# Access the dataframe
df <- list_of_dataframes[[i]]
# Extract a representative 'group' and 'condition' value (assuming they are consistent within each dataframe)
group_value <- unique(df$group)[1]
condition_value <- unique(df$condition)[1]
# Iterate over selected markers
for (sel_idx in 1:4) {
# Plot before gap-filling
plot_before <- df %>%
dplyr::filter(marker %in% markers_of_interest, marker == markers_of_interest[sel_idx]) %>%
ggplot(aes(x = elapsed_time, y = x, color = subject)) +
geom_line() +
facet_wrap(~condition) +
scale_color_manual(values= aesthetic_highlight_difference_palette)+
labs(
x = "Elapsed time",
y = "Marker X position",
title = paste("Marker", markers_of_interest[sel_idx], "X position before gap filling"),
subtitle = paste("Group:", group_value, " | ", "Condition:", condition_value)
)
print(plot_before)
# Apply the linear gap fill function to each column, by condition
df <- df %>%
dplyr::group_by(condition, subject, marker) %>%
dplyr::mutate(across(c(x, y, z), ~ gap_fill_linear(.)))
# Update the dataframe in the list
list_of_dataframes[[i]] <- df
# Plot after gap-filling
plot_after <- df %>%
dplyr::filter(marker %in% markers_of_interest, marker == markers_of_interest[sel_idx]) %>%
ggplot(aes(x = elapsed_time, y = x, color = subject)) +
geom_line() +
facet_wrap(~condition) +
scale_color_manual(values= aesthetic_highlight_difference_palette)+
labs(
x = "Elapsed time",
y = "Marker X position",
title = paste("Marker", markers_of_interest[sel_idx], "X position after gap filling"),
subtitle = paste("Group:", group_value, " | ", "Condition:", condition_value)
)
print(plot_after)
# Save the plot if needed
# ggsave(...) - add the appropriate ggsave call here if needed
}
}
# At this point, list_of_dataframes is updated with the gap-filled data
# Combine all dataframes into a single dataframe
combined_df <- dplyr::bind_rows(list_of_dataframes, .id = "group")
rm(plot_after, plot_before, df, sel_idx, i, condition_value, group_value)
Before proceeding we must trim the trajectories to accurately match the window of data collection - meaning, we must identify the 30 second window from which the t-pose starts.
Our original dataframes have a frame every 0,003rd second, so in order to save computational power, we simplify the dataframe for visualisation. In this case we will have points every half second. In doing so we can visually identify the point of t-pose start and end.
When we trim both frames, we effectively create a prior and posterior version of our data, which will later be used for NA-checking, euclidian distance, etc.
# Create a new list with processed dataframes without overwriting the original list
every_half_second_df_list <- lapply(list_of_dataframes, function(df) {
df %>%
group_by(subject, marker) %>%
slice(seq(1, n(), by = 151)) %>%
ungroup()
})
Now, every_half_second_df_list contains the simplified versions of the dataframes: the version with data every half second. Using this list, we can now make 3d plots with an included timeframe and identify the t-poses and effectively our 30 second window.
# Create an empty list to store the plots
plot_list <- list()
# Iterate through the simplified dataframes
for (i in seq_along(every_half_second_df_list)) {
df_name <- names(every_half_second_df_list)[i]
df <- every_half_second_df_list[[i]]
# Create the base plot
fig <- plotly::plot_ly(df,
x = ~x,
y = ~y,
z = ~z,
type = "scatter3d",
mode = "markers",
size = 2,
frame = ~elapsed_time,
marker = list(size = 4), ## Adjusting the marker size
color = ~subject)
fig <- fig %>% plotly::layout(
scene = list(
xaxis = list(title = "X-axis", range = c(-800, 800)),
yaxis = list(title = "Y-axis", range = c(-800, 800)),
zaxis = list(title = "Z-axis", range = c(0, 1800)),
aspectmode = "manual",
aspectratio = list(x = 1, y = 1, z = 1)
),
xaxis = list(title = "X-axis"),
yaxis = list(title = "Y-axis"),
zaxis = list(title = "Z-axis"))
fig <- fig %>% ## adding and changing text
plotly::layout(title = list(text = df_name, y = 0.9),
font=list(size=15, family = "Times new roman"),
legend = list(title = list(text = "markers")))
plot_list[[i]] <- fig
}
# Now plot_list contains all the plots for each simplified dataframe
# Create the 'results' directory if it doesn't exist
if (!dir.exists("results")) {
dir.create("results")
}
# Create the '3D plots' subdirectory inside 'results' if it doesn't exist
subdir <- "results/3D plots"
if (!dir.exists(subdir)) {
dir.create(subdir)
}
# Loop through the list of plots and save each one
for (i in seq_along(plot_list)) {
plot <- plot_list[[i]]
filename <- paste0(subdir, "/plot_", i, ".html") # Save files in the subdir
saveWidget(plot, file = filename)
}
plot_list
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
rm(df, fig, plot, i, df_name, filename, current_df)
Having manually identified the relevant time-frames, we create a new dataframe and input the relevant values of elapsed time.
start_time_df <- data.frame(
group_and_condition = c("group0_jointlead",
"group0_leadfollow",
"group1_jointlead",
"group1_leadfollow",
"group12_jointlead",
"group12_leadfollow",
"group13_jointlead",
"group13_leadfollow",
"group2_jointlead",
"group2_leadfollow",
"group3_jointlead",
"group3_leadfollow",
"group4_leadfollow",
"group6_jointlead",
"group6_leadfollow",
"group8_leadfollow"),
elapsed_time_at_start = as.numeric(c("17",
"35",
"12",
"50",
"33",
"28",
"37",
"94",
"18",
"31",
"12",
"42",
"37",
"14",
"29",
"52")
)
)
#We no longer need the simplified dataframe :)
rm(every_half_second_df_list)
Now we will iterate through all the group dataframes and pick the elapsed_time value associated. First we do this for the gap-filled list:
# Iterate through rows of start_time_df and extract subsets
for (i in 1:nrow(start_time_df)) {
df_name <- start_time_df$group_and_condition[i]
start_time <- start_time_df$elapsed_time_at_start[i]
# Find the corresponding dataframe in list_of_dataframes, if it exists
if (df_name %in% names(list_of_dataframes)) {
# Access the dataframe directly from the list
df <- list_of_dataframes[[df_name]]
# Arrange the dataframe by elapsed_time
df <- df %>% dplyr::arrange(elapsed_time)
# Extract the subset based on the starting value
starting_index <- which(df$elapsed_time == start_time)[1]
ending_index <- which(df$elapsed_time == start_time + 30)[1]
if (!is.na(starting_index) && !is.na(ending_index)) {
# Update the original dataframe in the list with the subset
list_of_dataframes[[df_name]] <- df[starting_index:ending_index, ]
}
# No else block needed, if indices are not found, the original dataframe remains unchanged
}
# No else block needed, if the dataframe does not exist in the list, nothing happens
}
# Now, list_of_dataframes contains the updated dataframes with the subsets
Next we will trim the unfilled list:
# Iterate through rows of start_time_df and extract subsets
for (i in 1:nrow(start_time_df)) {
df_name <- start_time_df$group_and_condition[i]
start_time <- start_time_df$elapsed_time_at_start[i]
# Find the corresponding dataframe in list_of_dataframes, if it exists
if (df_name %in% names(list_of_dataframes_unfilled)) {
# Access the dataframe directly from the list
df <- list_of_dataframes_unfilled[[df_name]]
# Arrange the dataframe by elapsed_time
df <- df %>% dplyr::arrange(elapsed_time)
# Extract the subset based on the starting value
starting_index <- which(df$elapsed_time == start_time)[1]
ending_index <- which(df$elapsed_time == start_time + 30)[1]
if (!is.na(starting_index) && !is.na(ending_index)) {
# Update the original dataframe in the list with the subset
list_of_dataframes_unfilled[[df_name]] <- df[starting_index:ending_index, ]
}
# No else block needed, if indices are not found, the original dataframe remains unchanged
}
# No else block needed, if the dataframe does not exist in the list, nothing happens
}
# Now, list_of_dataframes contains the updated dataframes with the subsets
#cleaning up again!
rm(df, df_name, ending_index, first_index, i, start_time, starting_index, start_time_df)
First we visualise the untrimmed dataset:
# Assuming markers_of_interest is defined and has at least 4 elements
# Assuming gap_fill_linear is a function that performs linear gap filling
for (i in seq_along(list_of_dataframes_unfilled)) {
# Access the dataframe
df <- list_of_dataframes_unfilled[[i]]
# Create a combined plot for each group and condition, faceted by marker
unique_groups <- unique(df$group)
unique_conditions <- unique(df$condition)
for (group in unique_groups) {
for (condition in unique_conditions) {
plot_combined <- df %>%
dplyr::filter(group == group, condition == condition, marker %in% markers_of_interest) %>%
ggplot(aes(x = elapsed_time, y = x, color = subject)) +
geom_line() +
scale_color_manual(values= aesthetic_highlight_difference_palette)+
facet_wrap(~marker, nrow = 2, ncol = 2) + # Arrange facets in a 2x2 grid by marker
labs(
x = "Elapsed time",
y = "Marker X position",
title = paste("Before Gap-filling"),
subtitle = paste("Group:", group, "| Condition:", condition, "- Marker X Positions")
)
print(plot_combined)
}
}
}
There are a couple of errors clearly visible in the plots, that will have an impact on our analysis. For instance, group 8 gets cut off too early. Meaning the recording was either stopped before it should have been, or has been cut short during the session afterwards.
Going through the plots defined in the previous section, it is clear that some sets have artifacts or entirely missing data where linear gap-filling is not a viable solution. Therefore we exclude these entirely. Omitting data like this, is not ideal, but we have chosen that we would rather have proper data for fewer groups than bad data for many groups. The following table will show the excluded groups, as well as the reasoning for excluding them:
| Group ID and condition | Markers | Reason for exclusion |
|---|---|---|
| Group 0 Joint lead | Chest | Subject A is missing the chest marker for ~80 of the time. |
| Group 12 Lead Follow | All | The only useable marker is chest, and that does not account for a lot of movement. |
| Group 2 Lead Follow | All | Missing data. |
| Group 13 BOTH | All | Incorrect labelling. |
We will exclude these from the dataframe going forward
#removing markers
list_of_dataframes$group0_jointlead %>%
filter(marker != "chest")
## # A tibble: 54,001 × 9
## # Groups: condition, subject, marker [6]
## index elapsed_time condition group marker subject x y z
## <dbl> <dbl> <fct> <fct> <chr> <chr> <dbl> <dbl> <dbl>
## 1 5101 17 jointlead group0 head_left A 572. -61.1 1673.
## 2 5101 17 jointlead group0 hand_right A 601. 696. 1363.
## 3 5101 17 jointlead group0 hand_left A 639. -662. 1376.
## 4 5101 17 jointlead group0 head_left B -369. 48.7 1754.
## 5 5101 17 jointlead group0 hand_right B -469. -724. 1367.
## 6 5101 17 jointlead group0 hand_left B NA NA NA
## 7 5102 17.0 jointlead group0 head_left A 572. -61.1 1673.
## 8 5102 17.0 jointlead group0 hand_right A 601. 696. 1363.
## 9 5102 17.0 jointlead group0 hand_left A 639. -662. 1376.
## 10 5102 17.0 jointlead group0 head_left B -369. 48.7 1754.
## # ℹ 53,991 more rows
#removing dataframes
list_of_dataframes[["group12_leadfollow"]] <- NULL
list_of_dataframes[["group2_leadfollow"]] <- NULL
list_of_dataframes[["group13_jointlead"]] <- NULL
list_of_dataframes[["group13_leadfollow"]] <- NULL
Then we visualise the data after gap-filling:
# Assuming markers_of_interest is defined and has at least 4 elements
# Assuming gap_fill_linear is a function that performs linear gap filling
for (i in seq_along(list_of_dataframes)) {
# Access the dataframe
df <- list_of_dataframes[[i]]
# Create a combined plot for each group and condition, faceted by marker
unique_groups <- unique(df$group)
unique_conditions <- unique(df$condition)
for (group in unique_groups) {
for (condition in unique_conditions) {
plot_combined <- df %>%
dplyr::filter(group == group, condition == condition, marker %in% markers_of_interest) %>%
ggplot(aes(x = elapsed_time, y = x, color = subject)) +
geom_line() +
scale_color_manual(values= aesthetic_highlight_difference_palette)+
facet_wrap(~marker, nrow = 2, ncol = 2) + # Arrange facets in a 2x2 grid by marker
labs(
x = "Elapsed time",
y = "Marker X position",
title = paste("After Gap-filling"),
subtitle = paste("Group:", group, "| Condition:", condition, "- Marker X Positions")
)
print(plot_combined)
}
}
}
As clarified prior to gap filling, the following sections were used to choose which markers to include in our analysis. In the following we will use the list of dataframes that have been cut to the relevant 30 seconds yet not gap-filled to give a thorough overview of the actually relevant amount of NAs.
For the most accurate analysis we wanted to include the following markers: Head, both hands and elbows, and chest. We checked for NA’s to verify none of these markers have too many NAs.
combined_df <- dplyr::bind_rows(list_of_dataframes_unfilled) %>%
dplyr::ungroup()
combined_df %>%
dplyr::summarise(Total_NAs = sum(is.na(x)))
## # A tibble: 1 × 1
## Total_NAs
## <int>
## 1 505680
combined_df %>%
dplyr::summarise(Total_NON_NAs = sum(!is.na(x)))
## # A tibble: 1 × 1
## Total_NON_NAs
## <int>
## 1 2518336
#NA
na_pre_gap <- combined_df %>%
group_by(marker) %>%
summarise(na_count = sum(is.na(x))) %>%
arrange(na_count)
#NON NA
non_na_pre_gap <-combined_df %>%
group_by(marker) %>%
summarise(non_na_count = sum(!is.na(x))) %>%
arrange(non_na_count)
print(na_pre_gap)
## # A tibble: 11 × 2
## marker na_count
## <chr> <int>
## 1 head_top 1994
## 2 head_left 16513
## 3 shoulder_right 17522
## 4 shoulder_left 26239
## 5 head_right 27069
## 6 chest 42472
## 7 hand_right 47092
## 8 hand_left 53092
## 9 elbow_right 76457
## 10 elbow_left 87607
## 11 back 109623
print(non_na_pre_gap)
## # A tibble: 11 × 2
## marker non_na_count
## <chr> <int>
## 1 head_top 142022
## 2 back 178377
## 3 elbow_left 200393
## 4 elbow_right 211543
## 5 hand_left 234908
## 6 hand_right 240908
## 7 chest 245528
## 8 head_right 260931
## 9 shoulder_left 261761
## 10 shoulder_right 270478
## 11 head_left 271487
rm(non_na_pre_gap, na_pre_gap)
We noticed that head_top didn’t have a lot of NA’s, so we checked whether or not it was the extra marker for A
list_of_dataframes_unfilled$group0_jointlead %>%
dplyr::filter(subject == "B") %>%
distinct(marker)
## # A tibble: 10 × 1
## marker
## <chr>
## 1 head_right
## 2 head_left
## 3 chest
## 4 back
## 5 shoulder_right
## 6 shoulder_left
## 7 elbow_right
## 8 elbow_left
## 9 hand_right
## 10 hand_left
## Subject B has no head_top heheheheheee
head_top does not appear as a marker in the list, given subject B was only given 2 dots on their head and A was given two; therefore, head_top is the “missing” marker and we therefore cannot use it to compare subjects.
# Combine all dataframes in list_of_dataframes into one dataframe
combined_df <- dplyr::bind_rows(list_of_dataframes_unfilled, .id = "group")
# Identifying columns with NAs and processing
na_columns <- colnames(combined_df)[colSums(is.na(combined_df)) > 0]
columns_to_summarize <- c("marker", na_columns, "group", "condition")
# Summarize the number of NAs for each marker
result <- combined_df %>%
dplyr::select(dplyr::all_of(columns_to_summarize)) %>%
dplyr::group_by(group, marker, condition) %>%
dplyr::summarise(across(everything(), ~ sum(is.na(.))), .groups = "drop") %>%
dplyr::arrange(marker, condition)
# Plotting with facet wrap
p <- ggplot(result, aes(x = marker, y = x, fill = marker)) +
geom_bar(stat = "identity", position = "dodge", show.legend = FALSE) +
labs(title = "NA's in Combined Dataframes", x = "Marker", y = "Number of NA") +
facet_wrap(~ condition) +
coord_flip()+
scale_fill_manual(values = aesthetic_palette)
# Print the combined plot
print(p)
rm(p, result, columns_to_summarize, na_columns)
# Now we can get the longest sequence of NAs for each marker
# Function to calculate the longest NA run length
max_na_run_length <- function(vec) {
rle_na <- rle(is.na(vec))
max(rle_na$lengths[rle_na$values], na.rm = TRUE, default = 0)
}
# Now we can get the longest sequence of NAs for each marker
longest_na_seq <- combined_df %>%
group_by(condition, marker) %>%
summarise(
x_length = max_na_run_length(x),
.groups = "drop"
)
# plot to check if it is acceptable
longest_na_seq %>%
ggplot(aes(x = marker, y = x_length, fill=marker)) +
geom_col(
show.legend = FALSE
) +
coord_flip() +
facet_wrap(c(~condition)) +
labs(
x = "Marker",
y = "Longest NA sequence",
title = "Longest NA sequence by condition"
)+
scale_fill_manual(values = aesthetic_palette)+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Unfortunately, the elbow markers have a lot of NA’s. According to our plots, the head markers are consistently low in NA’s, while the distribution of NA’s for hands and chest is not consistent throughout the groups, we continue with checking the euclydian distance to further verify the validity of choosing said markers. However, the head_top is only present for participant A, and therefore we must choose either head_left or head_right. Since head_right has fewer NA’s and the longest sequence is also smaller, we proceed with head_right
# Iterate directly over the dataframes in list_of_dataframes
for (df in list_of_dataframes_unfilled) {
# Calculate the euclidean distance between each marker (using x, y, z)
# We will do this by subject, marker, and axis
marker_distances <- df %>%
dplyr::group_by(subject, marker) %>%
dplyr::arrange(index) %>%
dplyr::mutate(
diff_x = x - dplyr::lag(x, 1),
diff_y = y - dplyr::lag(y, 1),
diff_z = z - dplyr::lag(z, 1)
) %>%
dplyr::mutate(
euclidean_distance = sqrt(diff_x^2 + diff_y^2 + diff_z^2)
)
# Plot the series for each marker, and see if anything stands out
plot <- ggplot(marker_distances, aes(x = index, y = euclidean_distance)) +
geom_line(aes(color = factor(marker)), linewidth = 1.25) +
theme_minimal() +
facet_wrap(~condition + subject) +
labs(
x = "Index",
y = "Euclidean distance",
title = "Euclidean distance from the previous frame by marker",
subtitle = paste( df$group[1], "Condition:", df$condition[1])
)
# Print the plot
print(plot)
}
rm(df, df_name, marker_distances, plot)
# Assuming list_of_dataframes is a list of dataframes
for (df in list_of_dataframes_unfilled) {
# Calculate the euclidean distance between each marker (using x, y, z)
# We will do this by subject, marker, and axis
marker_distances <- df %>%
dplyr::group_by(subject, marker) %>%
dplyr::arrange(index) %>%
dplyr::mutate(
diff_x = x - dplyr::lag(x, 1),
diff_y = y - dplyr::lag(y, 1),
diff_z = z - dplyr::lag(z, 1)
) %>%
dplyr::mutate(
euclidean_distance = sqrt(diff_x^2 + diff_y^2 + diff_z^2)
) %>%
# Filter only the specified markers
dplyr::filter(marker %in% c("head_right", "hand_right", "hand_left", "chest"))
# Plot the series for each marker, and see if anything stands out
plot <- ggplot(marker_distances, aes(x = index, y = euclidean_distance)) +
geom_line(aes(color = factor(marker)), linewidth = 1.25) +
theme_minimal() +
facet_wrap(~condition + subject) +
labs(
x = "Index",
y = "Euclidean distance",
title = "Euclidean distance from the previous frame by marker",
subtitle = paste("Group:", df$group[1], "Condition:", df$condition[1])
)+
scale_color_manual(values = c("#d8aeb5","#a94f62","#8d5b5a","#2f1a1b"))
# Print the plot
print(plot)
}
rm(df, marker_distances, plot)
(inspecting to further validate the selection of markers -> the markers chosen account for most of movement)
## Making a for loop that plots x-coordinates for all markers in all groups
for (df in list_of_dataframes_unfilled) {
# Create ggplot
plot <- ggplot(df, aes(x = x, y = marker, color = factor(marker))) +
geom_point() +
ggtitle(paste("Plot of x-coordinates for", df$group[1], df$condition[1]))+
scale_color_manual(values = aesthetic_palette)+
theme(legend.position = "none")
# Print the plot
print(plot)
}
rm(df, plot)
## Making a for loop that plots y-coordinates for all markers in all groups
for (df in list_of_dataframes_unfilled) {
# Create ggplot
plot <- ggplot(df, aes(x = marker, y = y, color = factor(marker))) +
geom_point() +
ggtitle(paste("Plot of y-coordinates for", df$group[1], df$condition[1]))+
scale_color_manual(values = aesthetic_palette)+
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the plot
print(plot)
}
rm(df, plot)
# Combine all data frames into one
h <- bind_rows(list_of_dataframes_unfilled)
# Create ggplot
plot <- ggplot(h , aes(x = x, y = marker, color = factor(marker))) +
geom_point() +
ggtitle("x-coordinates plotted for all groups") +
scale_color_manual(values = aesthetic_palette) +
theme(legend.position = "none")
# Print the combined plot
print(plot)
# Combine all data frames into one
h <- bind_rows(list_of_dataframes_unfilled)
# Create ggplot
plot <- ggplot(h , aes(x = marker, y = y, color = factor(marker))) +
geom_point() +
ggtitle("y-coordinates plotted for all groups") +
scale_color_manual(values = aesthetic_palette) +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Print the combined plot
print(plot)
It is clear the quality of the dataset differs immensely, and some groups have significantly more NA’s than others.
# Apply the filter to each dataframe in list_of_dataframes
list_of_dataframes_unfilled <- lapply(list_of_dataframes_unfilled, function(df) {
df %>% dplyr::filter(marker %in% markers_of_interest)
})
#overwriting the combined
combined_df <- dplyr::bind_rows(list_of_dataframes_unfilled) %>%
dplyr::ungroup()
#Totals
combined_df %>%
dplyr::summarise(Total_NAs = sum(is.na(x)))
## # A tibble: 1 × 1
## Total_NAs
## <int>
## 1 159169
combined_df %>%
dplyr::summarise(Total_NON_NAs = sum(!is.na(x)))
## # A tibble: 1 × 1
## Total_NON_NAs
## <int>
## 1 992831
#NA
na_post_gap <- combined_df %>%
group_by(marker) %>%
summarise(na_count = sum(is.na(x)))
#NON NA
non_na_post_gap <- combined_df %>%
group_by(marker) %>%
summarise(non_na_count = sum(!is.na(x)))
print(non_na_post_gap)
## # A tibble: 4 × 2
## marker non_na_count
## <chr> <int>
## 1 chest 245528
## 2 hand_left 234908
## 3 hand_right 240908
## 4 head_left 271487
print(na_post_gap)
## # A tibble: 4 × 2
## marker na_count
## <chr> <int>
## 1 chest 42472
## 2 hand_left 53092
## 3 hand_right 47092
## 4 head_left 16513
rm(non_na_post_gap, na_post_gap)
#overwriting the combined with the gap filled from the list
combined_df <- dplyr::bind_rows(list_of_dataframes) %>%
dplyr::ungroup()
#Totals
combined_df %>%
dplyr::summarise(Total_NAs = sum(is.na(x)))
## # A tibble: 1 × 1
## Total_NAs
## <int>
## 1 29356
combined_df %>%
dplyr::summarise(Total_NON_NAs = sum(!is.na(x)))
## # A tibble: 1 × 1
## Total_NON_NAs
## <int>
## 1 834656
#NA
na_post_gap <- combined_df %>%
group_by(marker) %>%
summarise(na_count = sum(is.na(x)))
#NON NA
non_na_post_gap <- combined_df %>%
group_by(marker) %>%
summarise(non_na_count = sum(!is.na(x)))
print(non_na_post_gap)
## # A tibble: 4 × 2
## marker non_na_count
## <chr> <int>
## 1 chest 201590
## 2 hand_left 206844
## 3 hand_right 210210
## 4 head_left 216012
print(na_post_gap)
## # A tibble: 4 × 2
## marker na_count
## <chr> <int>
## 1 chest 14410
## 2 hand_left 9156
## 3 hand_right 5790
## 4 head_left 0
rm(non_na_post_gap, na_post_gap)
We still have a lot of NA’s caused by inadequate registering of markers during experiment. After gap-filling the total NAs have gone from 169725 to 93280.
We will continue analysis in a separate file
# Create a sub-directory for the CSV files if it doesn't already exist
dir.create("data/mocap_data_prepped", showWarnings = FALSE, recursive = TRUE)
# Loop through each data frame in the list
for (df_name in names(list_of_dataframes)) {
# Define the file path and name for each CSV within the data/csv_files directory
file_path <- paste0("data/mocap_data_prepped/", df_name, ".csv")
# Save the data frame to a CSV file
write_csv(list_of_dataframes[[df_name]], file_path)
}
# Inform the user that the operation is complete
cat("All data frames have been saved in the 'mocap_data_prepped' directory with their respective names.\n")
## All data frames have been saved in the 'mocap_data_prepped' directory with their respective names.
-------